library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(flextable)
library(officer)
library(tibble)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(pheatmap)
library(caret)
## Loading required package: lattice
library(corrplot)
## corrplot 0.94 loaded
library(randomForestSRC)
##
## randomForestSRC 3.3.1
##
## Type rfsrc.news() to see new features, changes, and bug fixes.
##
library(survival)
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
# Read data and rename columns
df <- readxl::read_excel("./data/AI_Tcells_DB.xlsx")
# Check duplicates
duplicates <- df %>%
group_by(ID, Time_OS, cGVHD_time, Names) %>%
filter(n() > 1) %>%
ungroup()
# Clean the data
df_clean <- df %>%
# Rename columns
rename(
Subject_ID = ID,
Observation_Days_Count = Time_OS,
cGVHD_Diagnosis_Day = cGVHD_time,
Cell_Count = Abs_Value
) %>%
mutate(
# Add flag to indicate which patients experienced cGVHD
cGVHD_flag = as.numeric(ifelse(!is.na(cGVHD_Diagnosis_Day), 1, 0)),
Subject_ID = as.factor(Subject_ID),
# Add blood test group
Blood_test_day = case_when(
grepl('ДЕНЬ ХРРТПХ_ПЕР.КРОВЬ', df$Names) ~ "cGVHD",
grepl('ДЕНЬ ХРРТПХ +', df$Names) ~ paste0(str_extract(df$Names, "(?<=\\+)\\d+(?=_)"), "_cGVHD"),
TRUE ~ str_extract(df$Names, "(?<=\\+)\\d+(?=_)")
),
# Create exact cell name
Cell_name = str_extract(Names, "(?<=/).*")
) %>%
select(-Names)
unify_cell_name <- function(cell_name, replacements) {
cell_name_unified <- cell_name
for (replacement in replacements) {
old_value <- replacement[1]
new_value <- replacement[2]
old_value <- str_replace_all(old_value, "([\\+\\*\\?\\|\\(\\)\\[\\]\\{\\}\\^\\$\\.])", "\\\\\\1")
cell_name_unified <- str_replace_all(cell_name_unified, old_value, new_value)
}
return(cell_name_unified)
}
replacements <- list(
c("PD1", "PD-1"),
c("СТАР2", "STAR2"),
c("4", "CD4_"),
c("CD4_+", "CD4+_"),
c("8", "CD8_"),
c("CD8_+", "CD8+_"),
c("Th", "TH"),
c("__", "_"),
c("_ ", "_")
)
# Apply the function to dataframe
df_clean <- df_clean %>%
mutate(
Cell_name_unified = Cell_name,
Cell_name_unified = unify_cell_name(Cell_name, replacements)
)
# Check for duplicates for control
duplicates <- df_clean %>%
group_by(Subject_ID, Observation_Days_Count, cGVHD_Diagnosis_Day, cGVHD_flag, Blood_test_day, Cell_name) %>%
filter(n() > 1) %>%
ungroup()
# Check the unique cells name
unique_cells_name <- unique(df_clean$Cell_name_unified)
print(unique_cells_name)
## [1] "CD4_TFH" "CD4_TH1" "CD4_TH17"
## [4] "CD4_TH17TO1" "CD4_TH2" "CD4_TH22"
## [7] "CD4+_(IM STAT)" "CD4+_" "CD4+_226+"
## [10] "CD4+_39+" "CD4+_DR+" "CD4+_PD-1+"
## [13] "CD4+_PD-1+TIGIT-" "CD4+_PD-1+TIGIT+" "CD4+_PD-1-TIGIT-"
## [16] "CD4+_PD-1-TIGIT+" "CD4+_TIGIT+" "CD4_NV(STAR2)"
## [19] "CD4_NV" "CD4_NV_TH1" "CD4_NV_TH17"
## [22] "CD4_NV_TH17TO1" "CD4_NV_TH2" "CD4_NV_TH22"
## [25] "CD4_NV+226+" "CD4_NV+39+" "CD4_NV+DR+"
## [28] "CD4_NV+PD-1+" "CD4_NV+PD-1+TIGIT-" "CD4_NV+PD-1+TIGIT+"
## [31] "CD4_NV+PD-1-TIGIT-" "CD4_NV+PD-1-TIGIT+" "CD4_NV+TIGIT+"
## [34] "CD4_ЕМ" "CD4_ЕМ_TH1" "CD4_ЕМ_TH17"
## [37] "CD4_ЕМ_TH17TO1" "CD4_ЕМ_TH2" "CD4_ЕМ_TH22"
## [40] "CD4_ЕМ+226+" "CD4_ЕМ+39+" "CD4_ЕМ+DR+"
## [43] "CD4_ЕМ+PD-1+" "CD4_ЕМ+PD-1+TIGIT-" "CD4_ЕМ+PD-1+TIGIT+"
## [46] "CD4_ЕМ+PD-1-TIGIT-" "CD4_ЕМ+PD-1-TIGIT+" "CD4_ЕМ+TIGIT+"
## [49] "CD4_ЕМTM" "CD4_СМ(STAR2)" "CD4_СМ"
## [52] "CD4_СМ_TH1" "CD4_СМ_TH17" "CD4_СМ_TH17TO1"
## [55] "CD4_СМ_TH2" "CD4_СМ_TH22" "CD4_СМ+226+"
## [58] "CD4_СМ+39+" "CD4_СМ+DR+" "CD4_СМ+PD-1+"
## [61] "CD4_СМ+PD-1+TIGIT-" "CD4_СМ+PD-1+TIGIT+" "CD4_СМ+PD-1-TIGIT-"
## [64] "CD4_СМ+PD-1-TIGIT+" "CD4_СМ+TIGIT+" "CD4_ТM_TH1"
## [67] "CD4_ТM_TH17" "CD4_ТM_TH17TO1" "CD4_ТM_TH2"
## [70] "CD4_ТM_TH22" "CD4_ТREG(STAR2)" "CD4_ТREG"
## [73] "CD4_ТREG_TH1" "CD4_ТREG_TH17" "CD4_ТREG_TH17TO1"
## [76] "CD4_ТREG_TH2" "CD4_ТREG_TH22" "CD4_ТЕ(STAR2)"
## [79] "CD4_ТЕ" "CD4_ТЕ_TH1" "CD4_ТЕ_TH17"
## [82] "CD4_ТЕ_TH17TO1" "CD4_ТЕ_TH2" "CD4_ТЕ_TH22"
## [85] "CD4_ТЕ+226+" "CD4_ТЕ+39+" "CD4_ТЕ+DR+"
## [88] "CD4_ТЕ+PD-1+" "CD4_ТЕ+PD-1+TIGIT-" "CD4_ТЕ+PD-1+TIGIT+"
## [91] "CD4_ТЕ+PD-1-TIGIT-" "CD4_ТЕ+PD-1-TIGIT+" "CD4_ТЕ+TIGIT+"
## [94] "CD4_ТМ" "CD8+_(IM STAT)" "CD8+_"
## [97] "CD8+_226+" "CD8+_39+" "CD8+_DR+"
## [100] "CD8+_PD-1+" "CD8+_PD-1+TIGIT-" "CD8+_PD-1+TIGIT+"
## [103] "CD8+_PD-1-TIGIT-" "CD8+_PD-1-TIGIT+" "CD8+_TIGIT+"
## [106] "CD8_EMTM" "CD8_EMTM+226+" "CD8_EMTM+39+"
## [109] "CD8_EMTM+DR+" "CD8_EMTM+PD-1+" "CD8_EMTM+PD-1+TIGIT-"
## [112] "CD8_EMTM+PD-1+TIGIT+" "CD8_EMTM+PD-1-TIGIT-" "CD8_EMTM+PD-1-TIGIT+"
## [115] "CD8_EMTM+TIGIT+" "CD8_NV(STAR2)" "CD8_NV"
## [118] "CD8_NV+226+" "CD8_NV+39+" "CD8_NV+DR+"
## [121] "CD8_NV+PD-1+" "CD8_NV+PD-1+TIGIT-" "CD8_NV+PD-1+TIGIT+"
## [124] "CD8_NV+PD-1-TIGIT-" "CD8_NV+PD-1-TIGIT+" "CD8_NV+TIGIT+"
## [127] "CD8_TREG" "CD8_ЕМ" "CD8_СМ(STAR2)"
## [130] "CD8_СМ" "CD8_СМ+226+" "CD8_СМ+39+"
## [133] "CD8_СМ+DR+" "CD8_СМ+PD-1+" "CD8_СМ+PD-1+TIGIT-"
## [136] "CD8_СМ+PD-1+TIGIT+" "CD8_СМ+PD-1-TIGIT-" "CD8_СМ+PD-1-TIGIT+"
## [139] "CD8_СМ+TIGIT+" "CD8_ТЕ(STAR2)" "CD8_ТЕ"
## [142] "CD8_ТЕ+226+" "CD8_ТЕ+39+" "CD8_ТЕ+DR+"
## [145] "CD8_ТЕ+PD-1+" "CD8_ТЕ+PD-1+TIGIT-" "CD8_ТЕ+PD-1+TIGIT+"
## [148] "CD8_ТЕ+PD-1-TIGIT-" "CD8_ТЕ+PD-1-TIGIT+" "CD8_ТЕ+TIGIT+"
## [151] "CD8_ТМ" "TREG_CM" "TREG_EMTM"
## [154] "TREG_NV" "TREG_TE" "TREG+226+"
## [157] "TREG+226+TIGIT-" "TREG+226+TIGIT+" "TREG+226-TIGIT-"
## [160] "TREG+226-TIGIT+" "TREG+39+" "TREG+DR+"
## [163] "TREG+PD-1+" "TREG+TIGIT+"
rm(duplicates, replacements, unify_cell_name)
Crucial steps:
# Check the number of unique days
unique_days <- unique(df_clean$Blood_test_day)
print(unique_days)
## [1] "180" "30" "365" "60" "90" "180_cGVHD"
## [7] "30_cGVHD" "60_cGVHD" "90_cGVHD" "cGVHD"
# Transform the dataframe
df_transformed <- df_clean %>%
group_by(Subject_ID, Blood_test_day) %>% # Group by Subject and Day of Blood Test
pivot_wider(
id_cols = c(Subject_ID, Observation_Days_Count, cGVHD_Diagnosis_Day, cGVHD_flag, Blood_test_day), # Columns to retain as is
names_from = Cell_name_unified, # New columns will be created from this column's values
values_from = Cell_Count, # Populate new columns with values from this column
values_fill = list(Cell_Count = NA) # Fill missing values with NA
) %>%
ungroup()
# Check if in any cell contain more than 1 number
multi_value_cells <- apply(df_transformed, 2, function(column) {
any(grepl(",", column, fixed = TRUE) | grepl(" ", column))
})
contains_multiple_values <- ifelse(any(multi_value_cells), "Yes", "No")
print(contains_multiple_values)
## [1] "Yes"
rm(unique_days, contains_multiple_values)
Crucial steps:
df_transformed_without_duplicated <- df_transformed %>%
select(-c(Blood_test_day, Observation_Days_Count, cGVHD_Diagnosis_Day, cGVHD_flag )) %>%
distinct()
# Prepare df_transformed
df_transformed <- df_transformed %>%
mutate(
Subject_ID = as.factor(Subject_ID),
cGVHD_flag = as.factor(cGVHD_flag)
)
# List of descriptive statistics
statistics <- list(
`Number of values` = ~as.character(sum(!is.na(.x))),
`No data` = ~as.character(sum(is.na(.x))),
`Mean` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(mean(.x, na.rm = TRUE) %>% round(2))),
`Median` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(median(.x, na.rm = TRUE) %>% round(2))),
`SD` = ~ifelse(sum(!is.na(.x)) < 3, "NA", as.character(sd(.x, na.rm = TRUE) %>% round(2))),
`min` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(min(.x, na.rm = TRUE) %>% round(2))),
`max` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(max(.x, na.rm = TRUE) %>% round(2)))
)
# Summarize the statistics for each numeric variable and make variables into columns
df_summary <- df_transformed %>%
select(where(is.numeric)) %>%
summarise(across(everything(), statistics, .names = "{.col}__{.fn}")) %>%
pivot_longer(cols = everything(), names_sep = "__", names_to = c("Variable", "Stat"), values_to = "Value") %>%
pivot_wider(names_from = Stat, values_from = Value) %>%
slice(1:25)
# Create a flextable for the summary
flextable(df_summary) %>%
autofit() %>%
border_inner(border = fp_border(color = "black", width = 1)) %>%
border_outer(border = fp_border(color = "black", width = 1))
Variable | Number of values | No data | Mean | Median | SD | min | max | _Number of values | _No data | _Mean | _Median | _SD | _min | _max |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Observation_Days_Count | 231 | 0 | 538.87 | 552 | 140.45 | 155 | 817 | |||||||
cGVHD_Diagnosis_Day | 145 | 86 | 196.94 | 182 | 78.58 | 89 | 502 | |||||||
CD4_TFH | 231 | 0 | 34.21 | 27.37 | 29.43 | 0 | 168.34 | |||||||
CD4_TH1 | 231 | 0 | 31.4 | 20.03 | 48.3 | 0.03 | 608.2 | |||||||
CD4_TH17 | 231 | 0 | 20.01 | 15.2 | 16.5 | 0.02 | 109.23 | |||||||
CD4_TH17TO1 | 231 | 0 | 5.93 | 3.75 | 6.51 | 0.03 | 39.95 | |||||||
CD4_TH2 | 231 | 0 | 20.79 | 14.48 | 22.47 | 0.02 | 143.45 | |||||||
CD4_TH22 | 231 | 0 | 7.42 | 4.91 | 7.82 | 0 | 43.52 | |||||||
CD4+_(IM STAT) | 231 | 0 | 93.49 | 78.4 | 80.61 | 0 | 535.83 | |||||||
CD4+ | 231 | 0 | 110.47 | 91.29 | 91.72 | 0.01 | 561.4 | |||||||
CD4+_226+ | 231 | 0 | 243.48 | 199.65 | 173.16 | 0.31 | 1046.46 | |||||||
CD4+_39+ | 231 | 0 | 58.67 | 47.56 | 56.12 | 0.07 | 379.73 | |||||||
CD4+_DR+ | 231 | 0 | 107.49 | 80.2 | 111.56 | 0.05 | 960.16 | |||||||
CD4+_PD-1+ | 231 | 0 | 146.36 | 116.88 | 118.65 | 0.33 | 789.05 | |||||||
CD4+_PD-1+TIGIT- | 231 | 0 | 79.22 | 64.8 | 62.39 | 0.28 | 335.51 | |||||||
CD4+_PD-1+TIGIT+ | 231 | 0 | 67.14 | 52.3 | 65.54 | 0.05 | 526.23 | |||||||
CD4+_PD-1-TIGIT- | 231 | 0 | 104.29 | 76.47 | 91.53 | 0.03 | 537.94 | |||||||
CD4+_PD-1-TIGIT+ | 231 | 0 | 10.11 | 6.67 | 13.56 | 0 | 154.31 | |||||||
CD4+_TIGIT+ | 231 | 0 | 77.25 | 59.32 | 75.19 | 0.05 | 680.54 | |||||||
CD4_NV(STAR2) | 231 | 0 | 49.66 | 30.32 | 60.85 | 0 | 510.77 | |||||||
CD4_NV | 231 | 0 | 53.69 | 34.57 | 60.86 | 0 | 420.64 | |||||||
CD4_NV_TH1 | 231 | 0 | 3.4 | 1.74 | 5.05 | 0 | 31.18 | |||||||
CD4_NV_TH17 | 231 | 0 | 1.49 | 1.02 | 1.58 | 0 | 8.85 | |||||||
CD4_NV_TH17TO1 | 231 | 0 | 0.32 | 0.15 | 0.42 | 0 | 2.7 | |||||||
CD4_NV_TH2 | 231 | 0 | 6.02 | 3.46 | 8.19 | 0 | 57.11 |
# Clean and summarize the categorical data for all factor variables
df_transformed %>%
select(-Subject_ID) %>%
select(where(is.factor)) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
group_by(Variable, Value) %>%
summarise(n = n(), .groups = 'drop') %>%
group_by(Variable) %>%
mutate(`No data` = sum(is.na(Value)),
`% by group` = (n / sum(n)) * 100) %>%
ungroup() %>%
select(Variable, Value, n, `% by group`, `No data`) %>%
arrange(Variable, Value) %>%
flextable() %>%
merge_v("Variable") %>%
autofit() %>%
border_inner(border = fp_border(color = "black", width = 1)) %>%
border_outer(border = fp_border(color = "black", width = 1))
Variable | Value | n | % by group | No data |
|---|---|---|---|---|
cGVHD_flag | 0 | 86 | 37.22944 | 0 |
1 | 145 | 62.77056 | 0 |
rm(df_summary, statistics)
The resulting table is too large and complex for effective understanding due to the number of numeric variables, but it highlights a high proportion of missing data (NA values).
Survival Analysis are planned.
Event: Diagnosis of cGVHD (cGVHD_flag == 1).
Survival Time:
Perform PCA to reduce 167 variables into fewer components that explain the maximum variance!
# Select days of interest
df_cGVHD_prediction <- df_transformed %>%
filter(Blood_test_day %in% c("90", "180", "365", "cGVHD"))
# df_cGVHD_prediction <- df_cGVHD_prediction %>%
# # Apply log transformation to numeric columns
# mutate(
# Observation_Days_Count = factor(Observation_Days_Count),
# cGVHD_Diagnosis_Day = factor(cGVHD_Diagnosis_Day)
# ) %>%
# mutate(across(
# where(is.numeric),
# ~ log10(. + 1),
# .names = "{col}"
# )) %>%
# mutate(
# Observation_Days_Count = as.numeric(Observation_Days_Count),
# cGVHD_Diagnosis_Day = as.numeric(cGVHD_Diagnosis_Day)
# )
Crucial steps:
# Calculate missing values by Blood_test_day
missing_summary <- df_cGVHD_prediction %>%
group_by(Blood_test_day) %>%
summarise(across(everything(), ~ mean(is.na(.)) * 100)) %>%
pivot_longer(cols = -Blood_test_day, names_to = "Column", values_to = "Missing_Percentage")
Crucial steps:
# Select relevant numeric columns
correlation_data <- df_cGVHD_prediction %>%
filter(Blood_test_day %in% c("90")) %>%
select(where(is.numeric))
# Compute correlation matrix
cor_matrix <- cor(correlation_data, use = "complete.obs")
# Mask weak correlations
cor_matrix[abs(cor_matrix) < 0.5] <- NA
# Plot only strong correlations
corrplot(cor_matrix,
method = "color",
type = "lower",
tl.col = "black",
tl.srt = 45,
number.cex = 0.1,
addCoef.col = "black",
mar = c(2, 2, 2, 2),
cl.cex = 0.5,
tl.cex = 0.15)
# Create a table with corellation >= 0.9 as Variable1 variable 2 correlation coefficient
# Correlation matrix into a long format
correlation_table <- as.data.frame(as.table(cor_matrix))
# Rename the columns
colnames(correlation_table) <- c("Variable1", "Variable2", "Correlation")
# Filter for correlations >= 0.9
filtered_table <- correlation_table %>%
filter(Correlation >= 0.9 & Variable1 != Variable2)
# Ensure no duplicate pairs by sorting Variable1 and Variable2
filtered_table <- filtered_table %>%
mutate(
Pair = paste0(pmin(Variable1, Variable2), "-", pmax(Variable1, Variable2))) %>%
distinct(Pair, .keep_all = TRUE) %>%
select(-Pair) %>%
arrange(desc(Correlation)) %>%
mutate(Num = row_number())
print(filtered_table)
## Variable1 Variable2 Correlation Num
## 1 CD4_NV+PD-1-TIGIT- CD4_NV(STAR2) 0.9988338 1
## 2 CD4_ЕМTM CD4_ЕМ+226+ 0.9986997 2
## 3 CD8+_(IM STAT) CD8+_ 0.9980477 3
## 4 CD4_СМ+226+ CD4_СМ(STAR2) 0.9980257 4
## 5 CD4_ТЕ+226+ CD4_ТЕ(STAR2) 0.9978914 5
## 6 CD4_ТЕ(STAR2) CD4_ТЕ+226+ 0.9978914 6
## 7 CD8+_DR+ CD8+_(IM STAT) 0.9969734 7
## 8 CD8+_226+ CD8+_(IM STAT) 0.9967175 8
## 9 CD8_NV+226+ CD8_NV(STAR2) 0.9957302 9
## 10 CD4+_TIGIT+ CD4+_PD-1+TIGIT+ 0.9951726 10
## 11 CD4+_PD-1+TIGIT+ CD4+_TIGIT+ 0.9951726 11
## 12 CD4_NV+226+ CD4_NV(STAR2) 0.9927777 12
## 13 CD4_NV CD4_NV(STAR2) 0.9910998 13
## 14 CD4_NV(STAR2) CD4_NV 0.9910998 14
## 15 CD4_ТЕ+TIGIT+ CD4_ТЕ+PD-1+TIGIT+ 0.9896745 15
## 16 CD8_СМ+226+ CD8_СМ(STAR2) 0.9894197 16
## 17 CD8_СМ(STAR2) CD8_СМ+226+ 0.9894197 17
## 18 CD4_ТM_TH1 CD4_TH1 0.9862180 18
## 19 CD4_TH1 CD4_ТM_TH1 0.9862180 19
## 20 CD4_ТM_TH17TO1 CD4_TH17TO1 0.9849283 20
## 21 CD4_TH17TO1 CD4_ТM_TH17TO1 0.9849283 21
## 22 CD8_NV+PD-1-TIGIT- CD8_NV(STAR2) 0.9847227 22
## 23 CD8_ТЕ+DR+ CD8+_(IM STAT) 0.9824943 23
## 24 CD8+_PD-1-TIGIT+ CD8+_(IM STAT) 0.9775403 24
## 25 CD4_ЕМ+PD-1+TIGIT- CD4+_PD-1+TIGIT- 0.9770290 25
## 26 CD8_EMTM+PD-1+TIGIT+ CD8+_(IM STAT) 0.9755977 26
## 27 CD4_ТM_TH22 CD4_TH22 0.9754262 27
## 28 CD4_TH22 CD4_ТM_TH22 0.9754262 28
## 29 CD8_ТЕ+226+ CD8+_(IM STAT) 0.9754144 29
## 30 CD8_ТЕ(STAR2) CD8+_(IM STAT) 0.9734285 30
## 31 CD8_EMTM+PD-1-TIGIT- CD4_ЕМ_TH1 0.9717046 31
## 32 TREG_CM CD4_ТREG 0.9710022 32
## 33 CD4_ТREG TREG_CM 0.9710022 33
## 34 CD8+_TIGIT+ CD8+_(IM STAT) 0.9705005 34
## 35 CD8_ТЕ+PD-1-TIGIT+ CD8+_(IM STAT) 0.9690826 35
## 36 CD4_ЕМ+DR+ CD4+_DR+ 0.9679522 36
## 37 CD8+_PD-1+ CD8+_(IM STAT) 0.9661445 37
## 38 TREG+226-TIGIT+ CD4_ТREG 0.9654008 38
## 39 TREG+39+ CD4_ТREG 0.9634220 39
## 40 CD8_СМ+TIGIT+ CD8_СМ(STAR2) 0.9627868 40
## 41 TREG+DR+ CD4_ТREG 0.9613748 41
## 42 TREG+TIGIT+ CD4_ТREG 0.9606648 42
## 43 CD8_NV+PD-1+TIGIT+ CD8_NV+PD-1+ 0.9592687 43
## 44 CD8_NV+PD-1+ CD8_NV+PD-1+TIGIT+ 0.9592687 44
## 45 CD8_СМ+DR+ CD8_СМ(STAR2) 0.9575376 45
## 46 CD8_EMTM+39+ CD4_ЕМ_TH1 0.9568416 46
## 47 CD4+_PD-1+ CD4+_DR+ 0.9565507 47
## 48 CD4+_DR+ CD4+_PD-1+ 0.9565507 48
## 49 TREG_EMTM CD4_ТREG 0.9564104 49
## 50 TREG+PD-1+ CD4_ТREG 0.9559641 50
## 51 CD4_ЕМ+39+ CD4+_39+ 0.9540809 51
## 52 CD4+_39+ CD4_ЕМ+39+ 0.9540809 52
## 53 CD8_EMTM+PD-1+TIGIT- CD4_ЕМ_TH1 0.9529874 53
## 54 CD8+_PD-1+TIGIT- CD8+_(IM STAT) 0.9528599 54
## 55 CD8_ТЕ CD8+_(IM STAT) 0.9497343 55
## 56 CD4_ТМ CD4_ЕМ+226+ 0.9494617 56
## 57 CD8_NV+PD-1-TIGIT+ CD8_NV+TIGIT+ 0.9481596 57
## 58 CD8_EMTM+226+ CD4_ЕМ_TH1 0.9462705 58
## 59 CD4_СМ+TIGIT+ CD4_СМ+PD-1+ 0.9456306 59
## 60 CD8_EMTM+DR+ CD4_ЕМ_TH1 0.9448519 60
## 61 CD8_ЕМ CD4_ЕМ_TH1 0.9428652 61
## 62 CD4_ТЕ+PD-1+TIGIT- CD4_TH1 0.9422821 62
## 63 CD8+_PD-1-TIGIT- CD4_ЕМ_TH1 0.9418677 63
## 64 CD8_NV+TIGIT+ CD8_NV+PD-1+ 0.9410303 64
## 65 CD8_ТЕ+PD-1+TIGIT- CD8+_PD-1+TIGIT- 0.9408341 65
## 66 CD8_EMTM CD4_ЕМ_TH1 0.9400801 66
## 67 CD4_NV+PD-1+TIGIT- CD4_NV+PD-1+ 0.9396690 67
## 68 CD4_NV+PD-1+ CD4_NV+PD-1+TIGIT- 0.9396690 68
## 69 CD4+_ CD4+_(IM STAT) 0.9382052 69
## 70 CD4+_(IM STAT) CD4+_ 0.9382052 70
## 71 CD4_СМ CD4_СМ(STAR2) 0.9374218 71
## 72 CD4_СМ(STAR2) CD4_СМ 0.9374218 72
## 73 CD8_ТМ CD8+_(IM STAT) 0.9362897 73
## 74 CD8_EMTM+PD-1-TIGIT+ CD4_ЕМ_TH1 0.9360448 74
## 75 CD4_ТREG(STAR2) CD4+_(IM STAT) 0.9354179 75
## 76 CD8_ТЕ+PD-1-TIGIT- CD4_ЕМ_TH1 0.9350791 76
## 77 CD8_ТЕ+PD-1+TIGIT+ CD8+_PD-1+ 0.9344868 77
## 78 CD8_ТЕ+TIGIT+ CD8+_(IM STAT) 0.9344091 78
## 79 CD8_СМ+39+ CD8_СМ(STAR2) 0.9337067 79
## 80 TREG+226+TIGIT+ CD4_ТREG 0.9336296 80
## 81 CD4_ТREG_TH2 TREG_EMTM 0.9329874 81
## 82 CD8+_PD-1+TIGIT+ CD8+_(IM STAT) 0.9310397 82
## 83 CD4_СМ_TH17 CD4_TH17 0.9275637 83
## 84 CD4_TH17 CD4_СМ_TH17 0.9275637 84
## 85 CD4_ЕМ+PD-1+TIGIT+ CD4+_PD-1+TIGIT+ 0.9258802 85
## 86 CD8_ТЕ+39+ CD4_ТЕ+39+ 0.9243269 86
## 87 CD4_ЕМ+PD-1+ CD4+_DR+ 0.9225987 87
## 88 CD4_ТREG_TH17 TREG+226+ 0.9210647 88
## 89 CD8_EMTM+PD-1+ CD4_ЕМ_TH1 0.9182152 89
## 90 CD4_ЕМ+TIGIT+ CD4+_PD-1+TIGIT+ 0.9163401 90
## 91 CD8_СМ+PD-1+ CD8_СМ(STAR2) 0.9155320 91
## 92 CD8_NV+DR+ CD8+_PD-1-TIGIT+ 0.9150977 92
## 93 CD8_СМ CD4_ЕМ_TH1 0.9143635 93
## 94 CD8_TREG CD4_ЕМ_TH1 0.9143576 94
## 95 CD4_ТЕ+DR+ CD4_TH1 0.9142354 95
## 96 CD8+_ CD4_ЕМ_TH1 0.9141853 96
## 97 CD4_ТЕ CD4_ЕМ 0.9109908 97
## 98 CD4_ЕМ CD4_ТЕ 0.9109908 98
## 99 CD8+_39+ CD4_ЕМ_TH1 0.9103280 99
## 100 CD4_ЕМ+226+ CD4_ЕМ+DR+ 0.9096396 100
## 101 CD4_СМ+PD-1+ CD4_СМ+DR+ 0.9084474 101
## 102 CD4_СМ+DR+ CD4_СМ+PD-1+ 0.9084474 102
## 103 CD4_СМ+PD-1+TIGIT+ CD4_СМ+DR+ 0.9080798 103
## 104 CD4_ТЕ+39+ CD4_ТЕ+DR+ 0.9076214 104
## 105 CD4_ТЕ+PD-1+TIGIT+ CD4_ТЕ+PD-1+ 0.9071497 105
## 106 CD4_NV+TIGIT+ CD4_NV+PD-1-TIGIT+ 0.9063595 106
## 107 CD4_NV+PD-1-TIGIT+ CD4_NV+TIGIT+ 0.9063595 107
## 108 TREG+226+ CD4_ТREG 0.9059511 108
## 109 CD8_ТЕ+PD-1+ CD4_ТЕ+PD-1+TIGIT+ 0.9045578 109
## 110 CD4_ТЕ+PD-1+ CD4+_DR+ 0.9042370 110
## 111 CD8_СМ+PD-1+TIGIT+ CD8_СМ(STAR2) 0.9031949 111
## 112 CD4_ТЕ_TH1 CD4_ЕМ_TH1 0.9030272 112
## 113 CD4_ЕМ_TH1 CD4_ТЕ_TH1 0.9030272 113
## 114 CD4+_PD-1+TIGIT- CD4+_DR+ 0.9027076 114
## 115 CD8_NV CD8_NV(STAR2) 0.9014977 115
## 116 CD8_NV(STAR2) CD8_NV 0.9014977 116
## 117 CD8_EMTM+TIGIT+ CD4_ЕМ_TH1 0.9006824 117
## 118 CD8_СМ+PD-1+TIGIT- CD8_СМ+226+ 0.9003490 118
# Check correlations for 'cGVHD_Diagnosis_Day' greater than 0.6
cgvhd_filtered_table <- correlation_table %>%
filter(
(Variable1 == "cGVHD_Diagnosis_Day") &
Correlation > 0.6 &
(Variable1 != Variable2)
) %>%
arrange(desc(Correlation)) %>%
mutate(Num = row_number())
# Print the filtered table
print(cgvhd_filtered_table)
## Variable1 Variable2 Correlation Num
## 1 cGVHD_Diagnosis_Day CD4_ТREG 0.7115962 1
## 2 cGVHD_Diagnosis_Day TREG_CM 0.6923027 2
## 3 cGVHD_Diagnosis_Day TREG+226-TIGIT+ 0.6656244 3
## 4 cGVHD_Diagnosis_Day CD4_ТЕ_TH17 0.6649963 4
## 5 cGVHD_Diagnosis_Day TREG+39+ 0.6364174 5
## 6 cGVHD_Diagnosis_Day TREG_EMTM 0.6356658 6
## 7 cGVHD_Diagnosis_Day TREG+TIGIT+ 0.6186489 7
## 8 cGVHD_Diagnosis_Day TREG+DR+ 0.6134311 8
## 9 cGVHD_Diagnosis_Day TREG+PD-1+ 0.6018802 9
Models that are not sensitive to multicollinearity because they split the data hierarchically:
# Select relevant numeric columns
correlation_data <- df_cGVHD_prediction %>%
filter(Blood_test_day %in% c("90")) %>%
select(where(is.numeric), -cGVHD_Diagnosis_Day)
# Compute correlation matrix
cor_matrix <- cor(correlation_data, use = "complete.obs")
# Hierarchical clustering and heatmap
pheatmap(cor_matrix,
cluster_rows = TRUE,
cluster_cols = TRUE,
display_numbers = FALSE,
fontsize_row = 2,
fontsize_col = 2,
main = "Clustered Heatmap for Survival Variables",
width = 20,
height = 20 )
# Perform hierarchical clustering
# Convert correlation to distance
distance_matrix <- as.dist(1 - cor_matrix)
hclust_result <- hclust(distance_matrix, method = "ward.D2") # Ward's method
# Cut the dendrogram into clusters (replace 5 with your desired number of clusters)
clusters <- cutree(hclust_result, k = 6) # Extract 5 clusters
# Create a data frame to associate variables with their clusters
clustered_data <- data.frame(
Variable = rownames(cor_matrix),
Cluster = clusters
)
# Dendrogram with clusters
plot(hclust_result, labels = rownames(cor_matrix), main = "Dendrogram of Clusters", cex = 0.2)
# Add rectangles to highlight clusters
rect.hclust(hclust_result, k = 5, border = 2:6)
custom_order <- c("90", "180", "365", "cGVHD")
# Convert Blood_test_day to a factor with the custom order
df_cGVHD_transformed <- df_cGVHD_prediction %>%
mutate(Blood_test_day = factor(Blood_test_day, levels = custom_order))
# Arrange data by the specified order of Blood_test_day
df_cGVHD_transformed <- df_cGVHD_transformed %>%
arrange(Blood_test_day)
# Aggregate data by Blood_test_day
heatmap_data <- df_cGVHD_transformed %>%
group_by(Blood_test_day) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
ungroup()
# Convert to matrix for heatmap plotting
heatmap_matrix <- heatmap_data %>%
column_to_rownames("Blood_test_day") %>%
as.matrix()
# Create the heatmap
pheatmap(heatmap_matrix,
cluster_rows = FALSE,
cluster_cols = TRUE,
display_numbers = FALSE,
fontsize_row = 6,
fontsize_col = 3,
main = "Heatmap: Variable Distribution by Blood Test Day")
Some variable distributions seem to change over time (Blood Test Day), with certain groups (e.g., those in warmer tones) showing higher activity at specific days, such as 365 (for non-log-transformed data).
# Filter and normalize data while preserving cGVHD_flag
df_normalized <- df_transformed %>%
filter(Blood_test_day %in% c("90", "180", "365", "cGVHD")) %>%
# Ensure cGVHD_flag is retained for later use
select(cGVHD_flag, where(is.numeric), -Observation_Days_Count, -cGVHD_Diagnosis_Day) %>%
mutate(across(where(is.numeric), ~ (.-mean(.)) / sd(.), .names = "z_{col}"))
# Save cGVHD_flag for later use
cGVHD_flags <- df_normalized$cGVHD_flag
# Perform near-zero variance filtering (excluding cGVHD_flag)
nzv <- nearZeroVar(df_normalized %>% select(-cGVHD_flag))
df_filtered <- df_normalized %>%
select(-nzv, cGVHD_flag) %>% # Retain cGVHD_flag after filtering
filter(complete.cases(.))
# Perform PCA (exclude cGVHD_flag from PCA computation)
pca_results <- prcomp(df_filtered %>% select(-cGVHD_flag), scale. = TRUE)
# Attach PCA scores and cGVHD_flag back together
pca_scores <- as.data.frame(pca_results$x) %>%
mutate(cGVHD_flag = cGVHD_flags)
# Explained variance calculations
explained_variance <- pca_results$sdev^2 / sum(pca_results$sdev^2)
cumulative_variance <- cumsum(explained_variance)
# Create a data frame for plotting explained variance
explained_variance_data <- data.frame(
PC = 1:length(explained_variance),
Explained_Variance = explained_variance,
Cumulative_Variance = cumulative_variance
)
# Filter for components contributing to the first 80% of cumulative variance
explained_variance_data_filtered <- explained_variance_data %>%
filter(Cumulative_Variance <= 0.8)
# Add percentage labels for explained variance
explained_variance_data_filtered <- explained_variance_data_filtered %>%
mutate(Variance_Percentage_Label = paste0(round(Explained_Variance * 100, 2), "%"))
# Plot explained variance with percentage labels
ggplot(explained_variance_data_filtered, aes(x = PC)) +
geom_bar(aes(y = Explained_Variance), stat = "identity", fill = "steelblue") +
geom_text(aes(y = Explained_Variance, label = Variance_Percentage_Label),
vjust = -0.5, size = 3.5) +
geom_line(aes(y = Cumulative_Variance), color = "red", size = 1) +
geom_point(aes(y = Cumulative_Variance), color = "red", size = 2) +
scale_y_continuous(
name = "Variance Explained",
sec.axis = sec_axis(~., name = "Cumulative Variance Explained")
) +
labs(
title = "Explained Variance by Principal Components (First 80%)",
x = "Principal Component"
) +
theme_minimal(base_size = 14)
# Perform clustering on PCA scores
set.seed(123)
wss <- sapply(1:10, function(k) {
kmeans(pca_scores[, 1:10], centers = k, nstart = 25)$tot.withinss
})
# Plot Elbow Method
elbow_plot <- data.frame(Clusters = 1:10, WSS = wss)
ggplot(elbow_plot, aes(x = Clusters, y = WSS)) +
geom_line() +
geom_point(size = 3) +
labs(
title = "Elbow Method for Optimal Clusters",
x = "Number of Clusters",
y = "Total Within-Cluster Sum of Squares (WSS)"
) +
theme_minimal(base_size = 14)
# Apply K-means clustering
optimal_clusters <- 4
kmeans_result <- kmeans(pca_scores[, 1:10], centers = optimal_clusters, nstart = 25)
pca_scores$Cluster <- as.factor(kmeans_result$cluster)
# Visualize clusters with coloring by cGVHD_flag
ggplot(pca_scores, aes(x = PC1, y = PC2, color = as.factor(cGVHD_flag))) +
geom_point(size = 2, alpha = 0.8) + # Scatterplot of PCA points
stat_ellipse(aes(group = Cluster), type = "t", linetype = "dashed", size = 1, color = "black") + # Ellipses for clusters
scale_color_brewer(palette = "Set1") + # Color palette for cGVHD_flag
labs(
title = "PCA Clusters with cGVHD_flag Coloring and Cluster Ellipses",
x = "PC1 (Principal Component 1)",
y = "PC2 (Principal Component 2)",
color = "cGVHD_flag"
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold", hjust = 0.5)
)
## Too few points to calculate an ellipse
# Load Boruta package
library(Boruta)
df_normalized <- df_transformed %>%
filter(Blood_test_day %in% c("90", "180", "365", "cGVHD")) %>%
select(where(is.numeric), -Observation_Days_Count, -cGVHD_Diagnosis_Day)
nzv <- nearZeroVar(df_normalized)
df_filtered <- df_normalized[, -nzv] # near-zero variance filtering
df_filtered <- df_normalized %>%
filter(complete.cases(.)) # Delete NA
df_filtered$cGVHD_flag <- as.factor(df_cGVHD_prediction$cGVHD_flag)
# Apply the Boruta algorithm for feature selection
boruta_result <- Boruta(cGVHD_flag ~ ., data = df_filtered)
final_decision <- attStats(boruta_result)
print(final_decision)
## meanImp medianImp minImp maxImp
## CD4_TFH 0.275937126 0.363346656 -1.373304148 2.2045750
## CD4_TH1 0.371346040 0.505538752 -0.597288351 1.7062409
## CD4_TH17 0.954583268 0.911847089 -0.274445066 2.3211816
## CD4_TH17TO1 0.318146704 0.300188508 -1.038701171 2.0288875
## CD4_TH2 0.964668073 1.043783003 -0.959916111 2.5666241
## CD4_TH22 3.078155557 3.089281887 -0.893501999 5.3328783
## CD4+_(IM STAT) 5.399603582 5.670148763 1.856439134 7.4640394
## CD4+_ 4.323622392 4.526130957 0.880961813 6.6238344
## CD4+_226+ 3.260640034 3.487806941 -0.224825581 5.7043893
## CD4+_39+ 0.889326587 1.317629337 -2.155066507 2.5682610
## CD4+_DR+ 0.401756149 0.380293275 -1.468688699 1.9918155
## CD4+_PD-1+ 1.424442622 1.622104770 -0.288787886 2.8361988
## CD4+_PD-1+TIGIT- 0.765410783 0.720661346 -0.806574946 2.1343221
## CD4+_PD-1+TIGIT+ 0.639586213 0.510466926 -0.999191981 1.9270155
## CD4+_PD-1-TIGIT- 0.170480823 0.235217125 -1.326237118 2.1653324
## CD4+_PD-1-TIGIT+ 0.341183177 0.376714784 -1.157962320 1.9185238
## CD4+_TIGIT+ 0.965902316 0.910284371 -0.783330140 2.0542812
## CD4_NV(STAR2) 1.048050974 1.392850096 -1.535950567 2.6391562
## CD4_NV 1.055530395 1.159838566 -0.767799105 2.4424385
## CD4_NV_TH1 1.460189943 1.433241451 -0.699308415 4.1650816
## CD4_NV_TH17 -0.456175317 -0.101836264 -2.536597757 0.7835719
## CD4_NV_TH17TO1 1.176414946 1.021924886 0.192406437 2.3896965
## CD4_NV_TH2 0.305547257 0.162967488 -1.007725134 2.4910715
## CD4_NV_TH22 0.524241445 0.542678857 -0.896401103 1.8755963
## CD4_NV+226+ 1.186467866 1.421094029 -0.814809588 2.7325232
## CD4_NV+39+ 0.816701613 1.008122887 -1.503421339 2.4853659
## CD4_NV+DR+ -0.303138152 -0.459627699 -1.583938505 2.0959799
## CD4_NV+PD-1+ 1.211566047 1.105250735 -1.401405853 3.2577951
## CD4_NV+PD-1+TIGIT- 0.408598443 0.415829048 -1.529148202 1.9143535
## CD4_NV+PD-1+TIGIT+ 4.614293992 4.875804653 1.558003660 6.4816540
## CD4_NV+PD-1-TIGIT- 1.154487236 1.218659233 -0.164363022 2.4240605
## CD4_NV+PD-1-TIGIT+ 0.957961906 1.186988904 -1.177357774 2.4579466
## CD4_NV+TIGIT+ 3.206585735 3.395910125 -0.086492946 5.3710123
## CD4_ЕМ 1.294249975 1.443855539 -0.111568456 3.0580621
## CD4_ЕМ_TH1 4.425039085 4.522884870 0.715454936 6.7525340
## CD4_ЕМ_TH17 0.862665350 0.678974214 -0.051625343 2.1812339
## CD4_ЕМ_TH17TO1 7.277045876 7.590777300 2.817118145 9.7254706
## CD4_ЕМ_TH2 0.761870675 0.700256440 -0.564301649 2.3565401
## CD4_ЕМ_TH22 0.311942033 0.182454742 -1.584506954 1.8604464
## CD4_ЕМ+226+ 1.543462670 1.587407351 -0.232483248 2.8254162
## CD4_ЕМ+39+ 0.490873221 0.553009928 -2.322914198 2.1131446
## CD4_ЕМ+DR+ 0.763199435 0.795206599 -0.242309562 1.8435584
## CD4_ЕМ+PD-1+ 1.108661312 1.045366721 -0.587789935 2.1740283
## CD4_ЕМ+PD-1+TIGIT- 0.753533613 0.523980025 -0.594501837 2.2679027
## CD4_ЕМ+PD-1+TIGIT+ 0.234784280 0.059973340 -0.926551321 1.7468386
## CD4_ЕМ+PD-1-TIGIT- 0.320463808 0.202897662 -1.585831542 1.9067665
## CD4_ЕМ+PD-1-TIGIT+ -0.171482584 -0.004365224 -2.487076483 1.1934384
## CD4_ЕМ+TIGIT+ 0.313357674 0.206748881 -0.790478815 1.6519178
## CD4_ЕМTM 0.972442004 0.983866929 -0.693934252 2.5688909
## CD4_СМ(STAR2) 0.495014014 0.692458516 -1.545233541 2.0400186
## CD4_СМ 1.241441723 1.196864479 -0.080049060 2.3574531
## CD4_СМ_TH1 3.036100840 3.075430409 0.713682863 5.1344505
## CD4_СМ_TH17 2.684795946 2.749483043 -0.652442251 4.5181968
## CD4_СМ_TH17TO1 1.122030943 1.181403891 -0.164238168 2.2762473
## CD4_СМ_TH2 4.195865506 4.300009084 1.835914028 6.4573984
## CD4_СМ_TH22 6.324460886 6.578230411 2.902173827 8.4741417
## CD4_СМ+226+ 0.611386315 0.526815767 -0.585970381 2.0100202
## CD4_СМ+39+ 0.446946101 0.220428622 -1.709526057 1.9717870
## CD4_СМ+DR+ 0.230021370 0.386396612 -2.035978470 1.8778436
## CD4_СМ+PD-1+ -0.073391310 0.066570761 -2.418947140 1.8099610
## CD4_СМ+PD-1+TIGIT- 0.114440809 0.067156146 -1.979122016 1.3230956
## CD4_СМ+PD-1+TIGIT+ 0.306397669 0.440762455 -1.359269499 1.5219055
## CD4_СМ+PD-1-TIGIT- 1.629357911 1.767858821 0.419367421 2.5323225
## CD4_СМ+PD-1-TIGIT+ -0.339016155 -0.599559338 -1.976389362 1.8409060
## CD4_СМ+TIGIT+ 1.285639667 1.153823947 0.175817586 2.6885042
## CD4_ТM_TH1 0.285572512 0.464849558 -1.259993702 1.2872835
## CD4_ТM_TH17 0.292847496 0.304753088 -1.688745028 1.8010805
## CD4_ТM_TH17TO1 0.186263363 0.289384980 -1.914947952 2.0064351
## CD4_ТM_TH2 1.532194388 1.787953944 -1.806899825 3.7786427
## CD4_ТM_TH22 1.514777463 1.660335982 -0.504604612 3.1377345
## CD4_ТREG(STAR2) 5.460464061 5.689751384 1.756057874 7.3049698
## CD4_ТREG 1.265972506 1.221614144 -0.348575251 3.0156694
## CD4_ТREG_TH1 0.247627280 0.287753203 -1.589943994 2.2532309
## CD4_ТREG_TH17 3.022393839 3.122630236 -0.278669754 5.0950106
## CD4_ТREG_TH17TO1 0.255921491 0.377248439 -1.322269270 1.9335020
## CD4_ТREG_TH2 1.105196155 1.123717180 -0.933057646 3.2852182
## CD4_ТREG_TH22 0.987104619 1.005433096 -0.220064145 2.0204995
## CD4_ТЕ(STAR2) -0.009239855 -0.046758290 -1.634244762 2.1583041
## CD4_ТЕ 0.081877250 0.083742266 -1.402301564 0.9325371
## CD4_ТЕ_TH1 0.963556997 0.956837277 -1.484088657 3.0604074
## CD4_ТЕ_TH17 -0.142582286 -0.222013362 -1.145531133 1.5803324
## CD4_ТЕ_TH17TO1 0.357180911 0.209989219 -0.819992371 1.5535715
## CD4_ТЕ_TH2 -0.037146397 -0.121007715 -1.576568762 1.9996748
## CD4_ТЕ_TH22 -0.378647968 -0.391869623 -1.910511822 1.6400068
## CD4_ТЕ+226+ -0.084977465 0.029119843 -1.779651338 1.9701350
## CD4_ТЕ+39+ 0.225154179 0.242366746 -1.393640860 2.5706828
## CD4_ТЕ+DR+ 0.338128404 0.280995104 -1.423633571 2.1610768
## CD4_ТЕ+PD-1+ -0.337577700 -0.212831518 -1.902984590 1.3309814
## CD4_ТЕ+PD-1+TIGIT- -0.086482143 0.101183134 -1.440889287 1.4999073
## CD4_ТЕ+PD-1+TIGIT+ -0.253567867 -0.432656154 -2.025138072 1.4108459
## CD4_ТЕ+PD-1-TIGIT- 0.446631026 0.327430980 -0.864384179 1.7955601
## CD4_ТЕ+PD-1-TIGIT+ 0.477131422 0.479091653 -0.880232336 2.2052402
## CD4_ТЕ+TIGIT+ -0.208655903 -0.492259251 -2.270875739 1.8804063
## CD4_ТМ 3.118855993 3.289358142 -0.604273718 6.1176763
## CD8+_(IM STAT) 1.245543351 1.134891343 -0.502711411 2.9189639
## CD8+_ 4.006916369 4.210659672 0.841928294 6.5756355
## CD8+_226+ 2.483765013 2.428979471 -0.808693052 5.2130788
## CD8+_39+ -0.379063321 -0.155133723 -2.535763344 1.0747775
## CD8+_DR+ 1.988255602 2.097422946 -0.793886512 3.8046116
## CD8+_PD-1+ 0.869200605 1.123658801 -1.585076394 2.9229171
## CD8+_PD-1+TIGIT- 0.590896132 0.820991236 -1.915853491 2.7986601
## CD8+_PD-1+TIGIT+ 1.510523501 1.777858532 -0.932663694 2.6355907
## CD8+_PD-1-TIGIT- 0.316711217 0.470918180 -1.569133034 1.8785878
## CD8+_PD-1-TIGIT+ 1.075033040 1.120514102 -1.010299501 2.6145925
## CD8+_TIGIT+ 0.793815895 0.639361358 -1.294132354 2.0755349
## CD8_EMTM 3.511573033 3.590030415 0.957769869 6.8932173
## CD8_EMTM+226+ 3.024237797 2.993001452 0.310661511 6.7024385
## CD8_EMTM+39+ 0.466648592 0.474180799 -0.531980283 1.9855748
## CD8_EMTM+DR+ 4.300386653 4.261313283 1.581026844 6.9364673
## CD8_EMTM+PD-1+ 3.775876235 3.749905232 0.902595568 6.1062808
## CD8_EMTM+PD-1+TIGIT- 0.380158011 0.422706828 -2.296118333 1.7327369
## CD8_EMTM+PD-1+TIGIT+ 3.589005859 3.582403241 0.368144410 6.5470238
## CD8_EMTM+PD-1-TIGIT- 1.928907557 1.862232666 -0.108248889 4.3267920
## CD8_EMTM+PD-1-TIGIT+ 3.104211759 3.256385791 -0.230726558 5.0558569
## CD8_EMTM+TIGIT+ 4.114975055 4.155144619 0.095649822 6.3890315
## CD8_NV(STAR2) 0.872511346 0.709799261 -0.932825493 3.3741478
## CD8_NV 0.793118171 0.542076392 -0.511067939 2.1592329
## CD8_NV+226+ 0.613828241 0.937545955 -1.569659968 3.5044567
## CD8_NV+39+ 0.478580415 0.579555459 -1.699584701 2.9321011
## CD8_NV+DR+ 0.606439175 0.907015036 -0.526150552 1.6725525
## CD8_NV+PD-1+ 0.931116648 0.914617239 -0.658030101 2.6969884
## CD8_NV+PD-1+TIGIT- -0.077075230 -0.222199100 -1.509486405 2.1573345
## CD8_NV+PD-1+TIGIT+ 0.519759990 0.445458950 -1.338224173 2.0992661
## CD8_NV+PD-1-TIGIT- 0.138326813 0.083642446 -1.274857745 1.6288564
## CD8_NV+PD-1-TIGIT+ -0.764273454 -0.658517969 -2.503523458 0.8065249
## CD8_NV+TIGIT+ -0.625253732 -0.600982393 -2.182651735 1.3664447
## CD8_TREG 4.573143874 4.713150088 1.264727338 7.1971468
## CD8_ЕМ 0.401006572 0.456698286 -1.604618625 2.5387477
## CD8_СМ(STAR2) 2.229309764 2.348587868 -0.774728333 4.1125219
## CD8_СМ 0.476739113 0.382370660 -0.855857856 2.0206210
## CD8_СМ+226+ 0.992989548 0.930151474 -1.307638320 2.4558743
## CD8_СМ+39+ 0.286903328 0.097155735 -1.778007951 2.1548406
## CD8_СМ+DR+ 0.958350232 1.185277829 -0.218306476 2.6159870
## CD8_СМ+PD-1+ 0.483315447 0.344323232 -1.613799766 2.3051279
## CD8_СМ+PD-1+TIGIT- 0.232948932 0.307150442 -1.165483388 1.2196431
## CD8_СМ+PD-1+TIGIT+ 0.390676863 0.478643468 -1.076552015 1.4154909
## CD8_СМ+PD-1-TIGIT- -0.132923436 -0.383993158 -1.087359733 1.1442626
## CD8_СМ+PD-1-TIGIT+ 0.170795339 0.331203802 -2.146338441 1.4848629
## CD8_СМ+TIGIT+ 1.262250296 1.219404311 -0.670622528 2.9762162
## CD8_ТЕ(STAR2) 1.140118583 1.189347883 -0.765343890 2.4204819
## CD8_ТЕ 2.744561912 2.805599848 -0.055792162 4.6806426
## CD8_ТЕ+226+ 0.756570542 0.788621745 -0.635056509 2.4760316
## CD8_ТЕ+39+ -0.491470208 -0.586552278 -1.652677715 0.6791737
## CD8_ТЕ+DR+ 0.890659976 0.491023615 -0.541953142 2.8469106
## CD8_ТЕ+PD-1+ 0.220095471 0.020839703 -1.105064261 2.1362759
## CD8_ТЕ+PD-1+TIGIT- 0.087018691 0.022795045 -1.880146612 1.9684677
## CD8_ТЕ+PD-1+TIGIT+ 0.435369019 0.580735233 -1.893484410 1.7130266
## CD8_ТЕ+PD-1-TIGIT- 0.899762325 0.733538690 -0.448767627 2.4188596
## CD8_ТЕ+PD-1-TIGIT+ 1.067246563 1.168130080 -0.863593005 1.9096426
## CD8_ТЕ+TIGIT+ 1.372222386 1.177453128 -1.079961512 2.9142020
## CD8_ТМ 1.234091031 1.110640280 -0.027192845 2.5175713
## TREG_CM 0.191214788 0.211638335 -1.091895070 1.8863218
## TREG_EMTM 2.439858040 2.442110955 -0.114936504 5.3648939
## TREG_NV 0.356283485 0.582730263 -1.311403623 1.4648789
## TREG_TE 4.246379656 4.282526372 1.885810026 6.6914198
## TREG+226+ 1.442358303 1.751763113 -0.674603118 2.7704962
## TREG+226+TIGIT- -0.015635124 0.006790233 -1.558522544 1.6397693
## TREG+226+TIGIT+ 1.715041587 1.766836755 -0.002211773 3.9427960
## TREG+226-TIGIT- 0.755989274 0.638027510 -0.246772753 2.0734027
## TREG+226-TIGIT+ 0.829909315 0.867239697 -0.497418970 2.1099737
## TREG+39+ 0.847837242 0.865698782 -1.164731512 1.9398886
## TREG+DR+ 1.658350976 1.583106645 0.032533031 3.8202896
## TREG+PD-1+ 0.821897718 0.672039469 -1.858199092 2.9088199
## TREG+TIGIT+ 2.957800390 3.015045454 0.184339334 5.3598859
## normHits decision
## CD4_TFH 0.01010101 Rejected
## CD4_TH1 0.00000000 Rejected
## CD4_TH17 0.00000000 Rejected
## CD4_TH17TO1 0.00000000 Rejected
## CD4_TH2 0.00000000 Rejected
## CD4_TH22 0.47474747 Tentative
## CD4+_(IM STAT) 0.88888889 Confirmed
## CD4+_ 0.78787879 Confirmed
## CD4+_226+ 0.53535354 Tentative
## CD4+_39+ 0.00000000 Rejected
## CD4+_DR+ 0.00000000 Rejected
## CD4+_PD-1+ 0.01010101 Rejected
## CD4+_PD-1+TIGIT- 0.00000000 Rejected
## CD4+_PD-1+TIGIT+ 0.00000000 Rejected
## CD4+_PD-1-TIGIT- 0.00000000 Rejected
## CD4+_PD-1-TIGIT+ 0.00000000 Rejected
## CD4+_TIGIT+ 0.00000000 Rejected
## CD4_NV(STAR2) 0.01010101 Rejected
## CD4_NV 0.00000000 Rejected
## CD4_NV_TH1 0.06060606 Rejected
## CD4_NV_TH17 0.00000000 Rejected
## CD4_NV_TH17TO1 0.00000000 Rejected
## CD4_NV_TH2 0.00000000 Rejected
## CD4_NV_TH22 0.00000000 Rejected
## CD4_NV+226+ 0.00000000 Rejected
## CD4_NV+39+ 0.00000000 Rejected
## CD4_NV+DR+ 0.00000000 Rejected
## CD4_NV+PD-1+ 0.03030303 Rejected
## CD4_NV+PD-1+TIGIT- 0.00000000 Rejected
## CD4_NV+PD-1+TIGIT+ 0.83838384 Confirmed
## CD4_NV+PD-1-TIGIT- 0.00000000 Rejected
## CD4_NV+PD-1-TIGIT+ 0.00000000 Rejected
## CD4_NV+TIGIT+ 0.50505051 Tentative
## CD4_ЕМ 0.00000000 Rejected
## CD4_ЕМ_TH1 0.77777778 Confirmed
## CD4_ЕМ_TH17 0.00000000 Rejected
## CD4_ЕМ_TH17TO1 0.98989899 Confirmed
## CD4_ЕМ_TH2 0.00000000 Rejected
## CD4_ЕМ_TH22 0.00000000 Rejected
## CD4_ЕМ+226+ 0.01010101 Rejected
## CD4_ЕМ+39+ 0.00000000 Rejected
## CD4_ЕМ+DR+ 0.00000000 Rejected
## CD4_ЕМ+PD-1+ 0.00000000 Rejected
## CD4_ЕМ+PD-1+TIGIT- 0.00000000 Rejected
## CD4_ЕМ+PD-1+TIGIT+ 0.00000000 Rejected
## CD4_ЕМ+PD-1-TIGIT- 0.00000000 Rejected
## CD4_ЕМ+PD-1-TIGIT+ 0.00000000 Rejected
## CD4_ЕМ+TIGIT+ 0.00000000 Rejected
## CD4_ЕМTM 0.00000000 Rejected
## CD4_СМ(STAR2) 0.00000000 Rejected
## CD4_СМ 0.00000000 Rejected
## CD4_СМ_TH1 0.49494949 Tentative
## CD4_СМ_TH17 0.36363636 Tentative
## CD4_СМ_TH17TO1 0.00000000 Rejected
## CD4_СМ_TH2 0.73737374 Confirmed
## CD4_СМ_TH22 0.95959596 Confirmed
## CD4_СМ+226+ 0.00000000 Rejected
## CD4_СМ+39+ 0.00000000 Rejected
## CD4_СМ+DR+ 0.00000000 Rejected
## CD4_СМ+PD-1+ 0.00000000 Rejected
## CD4_СМ+PD-1+TIGIT- 0.00000000 Rejected
## CD4_СМ+PD-1+TIGIT+ 0.00000000 Rejected
## CD4_СМ+PD-1-TIGIT- 0.00000000 Rejected
## CD4_СМ+PD-1-TIGIT+ 0.00000000 Rejected
## CD4_СМ+TIGIT+ 0.01010101 Rejected
## CD4_ТM_TH1 0.00000000 Rejected
## CD4_ТM_TH17 0.00000000 Rejected
## CD4_ТM_TH17TO1 0.00000000 Rejected
## CD4_ТM_TH2 0.02020202 Rejected
## CD4_ТM_TH22 0.02020202 Rejected
## CD4_ТREG(STAR2) 0.89898990 Confirmed
## CD4_ТREG 0.02020202 Rejected
## CD4_ТREG_TH1 0.00000000 Rejected
## CD4_ТREG_TH17 0.49494949 Tentative
## CD4_ТREG_TH17TO1 0.00000000 Rejected
## CD4_ТREG_TH2 0.02020202 Rejected
## CD4_ТREG_TH22 0.00000000 Rejected
## CD4_ТЕ(STAR2) 0.00000000 Rejected
## CD4_ТЕ 0.00000000 Rejected
## CD4_ТЕ_TH1 0.02020202 Rejected
## CD4_ТЕ_TH17 0.00000000 Rejected
## CD4_ТЕ_TH17TO1 0.00000000 Rejected
## CD4_ТЕ_TH2 0.00000000 Rejected
## CD4_ТЕ_TH22 0.00000000 Rejected
## CD4_ТЕ+226+ 0.00000000 Rejected
## CD4_ТЕ+39+ 0.00000000 Rejected
## CD4_ТЕ+DR+ 0.00000000 Rejected
## CD4_ТЕ+PD-1+ 0.00000000 Rejected
## CD4_ТЕ+PD-1+TIGIT- 0.00000000 Rejected
## CD4_ТЕ+PD-1+TIGIT+ 0.00000000 Rejected
## CD4_ТЕ+PD-1-TIGIT- 0.00000000 Rejected
## CD4_ТЕ+PD-1-TIGIT+ 0.00000000 Rejected
## CD4_ТЕ+TIGIT+ 0.00000000 Rejected
## CD4_ТМ 0.50505051 Tentative
## CD8+_(IM STAT) 0.01010101 Rejected
## CD8+_ 0.66666667 Tentative
## CD8+_226+ 0.31313131 Tentative
## CD8+_39+ 0.00000000 Rejected
## CD8+_DR+ 0.06060606 Rejected
## CD8+_PD-1+ 0.01010101 Rejected
## CD8+_PD-1+TIGIT- 0.00000000 Rejected
## CD8+_PD-1+TIGIT+ 0.00000000 Rejected
## CD8+_PD-1-TIGIT- 0.00000000 Rejected
## CD8+_PD-1-TIGIT+ 0.00000000 Rejected
## CD8+_TIGIT+ 0.00000000 Rejected
## CD8_EMTM 0.57575758 Tentative
## CD8_EMTM+226+ 0.49494949 Tentative
## CD8_EMTM+39+ 0.00000000 Rejected
## CD8_EMTM+DR+ 0.73737374 Confirmed
## CD8_EMTM+PD-1+ 0.66666667 Tentative
## CD8_EMTM+PD-1+TIGIT- 0.00000000 Rejected
## CD8_EMTM+PD-1+TIGIT+ 0.62626263 Tentative
## CD8_EMTM+PD-1-TIGIT- 0.14141414 Rejected
## CD8_EMTM+PD-1-TIGIT+ 0.49494949 Tentative
## CD8_EMTM+TIGIT+ 0.76767677 Confirmed
## CD8_NV(STAR2) 0.01010101 Rejected
## CD8_NV 0.00000000 Rejected
## CD8_NV+226+ 0.01010101 Rejected
## CD8_NV+39+ 0.01010101 Rejected
## CD8_NV+DR+ 0.00000000 Rejected
## CD8_NV+PD-1+ 0.00000000 Rejected
## CD8_NV+PD-1+TIGIT- 0.00000000 Rejected
## CD8_NV+PD-1+TIGIT+ 0.00000000 Rejected
## CD8_NV+PD-1-TIGIT- 0.00000000 Rejected
## CD8_NV+PD-1-TIGIT+ 0.00000000 Rejected
## CD8_NV+TIGIT+ 0.00000000 Rejected
## CD8_TREG 0.79797980 Confirmed
## CD8_ЕМ 0.00000000 Rejected
## CD8_СМ(STAR2) 0.23232323 Rejected
## CD8_СМ 0.00000000 Rejected
## CD8_СМ+226+ 0.00000000 Rejected
## CD8_СМ+39+ 0.00000000 Rejected
## CD8_СМ+DR+ 0.01010101 Rejected
## CD8_СМ+PD-1+ 0.00000000 Rejected
## CD8_СМ+PD-1+TIGIT- 0.00000000 Rejected
## CD8_СМ+PD-1+TIGIT+ 0.00000000 Rejected
## CD8_СМ+PD-1-TIGIT- 0.00000000 Rejected
## CD8_СМ+PD-1-TIGIT+ 0.00000000 Rejected
## CD8_СМ+TIGIT+ 0.01010101 Rejected
## CD8_ТЕ(STAR2) 0.00000000 Rejected
## CD8_ТЕ 0.43434343 Tentative
## CD8_ТЕ+226+ 0.00000000 Rejected
## CD8_ТЕ+39+ 0.00000000 Rejected
## CD8_ТЕ+DR+ 0.00000000 Rejected
## CD8_ТЕ+PD-1+ 0.00000000 Rejected
## CD8_ТЕ+PD-1+TIGIT- 0.00000000 Rejected
## CD8_ТЕ+PD-1+TIGIT+ 0.00000000 Rejected
## CD8_ТЕ+PD-1-TIGIT- 0.00000000 Rejected
## CD8_ТЕ+PD-1-TIGIT+ 0.00000000 Rejected
## CD8_ТЕ+TIGIT+ 0.01010101 Rejected
## CD8_ТМ 0.00000000 Rejected
## TREG_CM 0.00000000 Rejected
## TREG_EMTM 0.35353535 Tentative
## TREG_NV 0.00000000 Rejected
## TREG_TE 0.71717172 Confirmed
## TREG+226+ 0.00000000 Rejected
## TREG+226+TIGIT- 0.00000000 Rejected
## TREG+226+TIGIT+ 0.01010101 Rejected
## TREG+226-TIGIT- 0.00000000 Rejected
## TREG+226-TIGIT+ 0.00000000 Rejected
## TREG+39+ 0.00000000 Rejected
## TREG+DR+ 0.02020202 Rejected
## TREG+PD-1+ 0.00000000 Rejected
## TREG+TIGIT+ 0.44444444 Tentative
# Extract important variables
important_vars <- getSelectedAttributes(boruta_result, withTentative = FALSE)
# Extract tentative variables
tentative_vars <- getSelectedAttributes(boruta_result, withTentative = TRUE)
only_tentative <- setdiff(tentative_vars, important_vars)
cat("Confirmed important variables:\n", paste(important_vars, collapse = ", "), "\n\n")
## Confirmed important variables:
## CD4+_(IM STAT), CD4+_, CD4_NV+PD-1+TIGIT+, CD4_ЕМ_TH1, CD4_ЕМ_TH17TO1, CD4_СМ_TH2, CD4_СМ_TH22, CD4_ТREG(STAR2), CD8_EMTM+DR+, CD8_EMTM+TIGIT+, CD8_TREG, TREG_TE
cat("Tentative variables:\n", paste(only_tentative, collapse = ", "), "\n")
## Tentative variables:
## CD4_TH22, CD4+_226+, CD4_NV+TIGIT+, CD4_СМ_TH1, CD4_СМ_TH17, CD4_ТREG_TH17, CD4_ТМ, CD8+_, CD8+_226+, CD8_EMTM, CD8_EMTM+226+, CD8_EMTM+PD-1+, CD8_EMTM+PD-1+TIGIT+, CD8_EMTM+PD-1-TIGIT+, CD8_ТЕ, TREG_EMTM, TREG+TIGIT+
selected_columns <- c(
"CD4+_(IM STAT)", "CD4+_", "CD4_NV+PD-1+TIGIT+", "CD4_ЕМ_TH1",
"CD4_ЕМ_TH17TO1", "CD4_СМ_TH2", "CD4_СМ_TH22", "CD4_ТREG(STAR2)",
"CD8+_", "CD8_EMTM+DR+", "CD8_EMTM+TIGIT+", "CD8_TREG", "TREG_TE",
"Subject_ID", "Observation_Days_Count", "cGVHD_Diagnosis_Day",
"cGVHD_flag", "Blood_test_day"
)
df_cGVHD_prediction_selected <- df_cGVHD_prediction %>%
select(all_of(selected_columns))
# Add additional columns
additional_columns <- c(
"CD4_NV+TIGIT+", "CD4_СМ_TH1", "CD4_СМ_TH17", "CD4_ТREG_TH17",
"CD4_ТМ", "CD8+_DR+", "CD8_EMTM", "CD8_EMTM+226+",
"CD8_EMTM+PD-1+", "CD8_EMTM+PD-1+TIGIT+", "CD8_EMTM+PD-1-TIGIT+",
"CD8_ТЕ", "TREG+226+TIGIT+", "TREG+DR+", "TREG+TIGIT+"
)
df_cGVHD_prediction_selected_plus <- df_cGVHD_prediction %>%
select(all_of(c(selected_columns, additional_columns)))
Clusterisation features
df_normalized <- df_cGVHD_prediction_selected_plus %>%
filter(Blood_test_day %in% c("90", "180", "365", "cGVHD")) %>%
# Ensure cGVHD_flag is retained for later use
select(cGVHD_flag, where(is.numeric), -Observation_Days_Count, -cGVHD_Diagnosis_Day) %>%
mutate(across(where(is.numeric), ~ (.-mean(.)) / sd(.), .names = "z_{col}"))
# Save cGVHD_flag for later use
cGVHD_flags <- df_normalized$cGVHD_flag
# Perform near-zero variance filtering (excluding cGVHD_flag)
nzv <- nearZeroVar(df_normalized %>% select(-cGVHD_flag))
df_filtered <- df_normalized %>%
select(-nzv, cGVHD_flag) %>% # Retain cGVHD_flag after filtering
filter(complete.cases(.))
# Perform PCA (exclude cGVHD_flag from PCA computation)
pca_results <- prcomp(df_filtered %>% select(-cGVHD_flag), scale. = TRUE)
# Attach PCA scores and cGVHD_flag back together
pca_scores <- as.data.frame(pca_results$x) %>%
mutate(cGVHD_flag = cGVHD_flags)
# Explained variance calculations
explained_variance <- pca_results$sdev^2 / sum(pca_results$sdev^2)
cumulative_variance <- cumsum(explained_variance)
# Create a data frame for plotting explained variance
explained_variance_data <- data.frame(
PC = 1:length(explained_variance),
Explained_Variance = explained_variance,
Cumulative_Variance = cumulative_variance
)
# Filter for components contributing to the first 80% of cumulative variance
explained_variance_data_filtered <- explained_variance_data %>%
filter(Cumulative_Variance <= 0.9)
# Add percentage labels for explained variance
explained_variance_data_filtered <- explained_variance_data_filtered %>%
mutate(Variance_Percentage_Label = paste0(round(Explained_Variance * 100, 2), "%"))
# Plot explained variance with percentage labels
ggplot(explained_variance_data_filtered, aes(x = PC)) +
geom_bar(aes(y = Explained_Variance), stat = "identity", fill = "steelblue") +
geom_text(aes(y = Explained_Variance, label = Variance_Percentage_Label),
vjust = -0.5, size = 3.5) +
geom_line(aes(y = Cumulative_Variance), color = "red", size = 1) +
geom_point(aes(y = Cumulative_Variance), color = "red", size = 2) +
scale_y_continuous(
name = "Variance Explained",
sec.axis = sec_axis(~., name = "Cumulative Variance Explained")
) +
labs(
title = "Explained Variance by Principal Components (First 80%)",
x = "Principal Component"
) +
theme_minimal(base_size = 14)
# Perform clustering on PCA scores
set.seed(123)
wss <- sapply(1:10, function(k) {
kmeans(pca_scores[, 1:10], centers = k, nstart = 25)$tot.withinss
})
# Plot Elbow Method
elbow_plot <- data.frame(Clusters = 1:10, WSS = wss)
ggplot(elbow_plot, aes(x = Clusters, y = WSS)) +
geom_line() +
geom_point(size = 3) +
labs(
title = "Elbow Method for Optimal Clusters",
x = "Number of Clusters",
y = "Total Within-Cluster Sum of Squares (WSS)"
) +
theme_minimal(base_size = 14)
# Apply K-means clustering
optimal_clusters <- 4
kmeans_result <- kmeans(pca_scores[, 1:10], centers = optimal_clusters, nstart = 25)
pca_scores$Cluster <- as.factor(kmeans_result$cluster)
# Visualize clusters with coloring by cGVHD_flag
ggplot(pca_scores, aes(x = PC1, y = PC2, color = as.factor(cGVHD_flag))) +
geom_point(size = 2, alpha = 0.8) + # Scatterplot of PCA points
stat_ellipse(aes(group = Cluster), type = "t", linetype = "dashed", size = 1, color = "black") + # Ellipses for clusters
scale_color_brewer(palette = "Set1") + # Color palette for cGVHD_flag
labs(
title = "PCA Clusters with cGVHD_flag Coloring and Cluster Ellipses",
x = "PC1 (Principal Component 1)",
y = "PC2 (Principal Component 2)",
color = "cGVHD_flag"
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold", hjust = 0.5)
)
# Group by Cluster and cGVHD_flag to count occurrences
cluster_cGVHD_counts <- pca_scores %>%
group_by(Cluster, cGVHD_flag) %>%
summarise(Count = n(), .groups = "drop") %>%
arrange(Cluster, cGVHD_flag)
# View the summarized table
print(cluster_cGVHD_counts)
## # A tibble: 8 × 3
## Cluster cGVHD_flag Count
## <fct> <fct> <int>
## 1 1 0 3
## 2 1 1 25
## 3 2 0 3
## 4 2 1 1
## 5 3 0 19
## 6 3 1 11
## 7 4 0 55
## 8 4 1 55
df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
mutate(
cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
Blood_test_day = ifelse(Blood_test_day == "cGVHD", cGVHD_Diagnosis_Day, Blood_test_day),
Blood_test_day = as.numeric(as.character(Blood_test_day)),
cGVHD_flag = as.numeric(as.character(cGVHD_flag))
) %>%
select(-Observation_Days_Count) %>%
select(-Subject_ID)
# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
filter(complete.cases(.)) # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 50) {
results <- data.frame(Trees = integer(), OOB_Error = numeric())
for (ntree in seq(50, max_trees, by = step)) {
set.seed(123) # Ensure reproducibility
rf_model <- rfsrc(
formula = formula,
data = data,
mtry = round(sqrt(ncol(data) - 2)),
nodesize = 15,
ntree = ntree, # Number of trees
importance = TRUE
)
# Extract the OOB Requested performance error
oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
results <- rbind(results, data.frame(
Trees = ntree,
OOB_Error = oob_error
))
}
return(results)
}
# Apply the function to your dataset
oob_results <- evaluate_trees(
data = dataSet, # Replace with your dataset
formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
max_trees = 1000,
step = 50
)
ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
geom_line() +
geom_point(size = 2) +
labs(
title = "OOB Requested Performance Error vs. Number of Trees",
x = "Number of Trees",
y = "OOB Requested Performance Error"
) +
theme_minimal()
# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]
print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 500"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
dataSet,
ntree = best_trees,
mtry = round(sqrt(ncol(dataSet) - 2)),
nodesize = 15,
membership = TRUE,
importance=TRUE)
# Print the Random Survival Forest object
print(RF_obj)
## Sample size: 172
## Number of deaths: 92
## Number of trees: 500
## Forest terminal node size: 15
## Average no. of terminal nodes: 9.864
## No. of variables tried at each split: 13
## Total no. of variables: 165
## Resampling used to grow trees: swor
## Resample size used to grow trees: 109
## Analysis: RSF
## Family: surv
## Splitting rule: logrank *random*
## Number of random split points: 10
## (OOB) CRPS: 74.62375368
## (OOB) stand. CRPS: 0.1486529
## (OOB) Requested performance error: 0.33273494
# Create a hypothetical observation for prediction
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names
newdata [,which(RF_obj$xvar.names == "peak_vo2")] <- quantile(RF_obj$xvar$peak_vo2, 0.25)
# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])
par(cex.axis = 1.0, cex.lab = 1.0, cex.main = 1.0, mar = c(6.0,6,1,1), mgp = c(4, 1, 0))
# Plot predicted survival probability over time
plot(
round(y.pred$time.interest, 2),
y.pred$survival[1, ],
type = "l",
xlab = "Time (Days)",
ylab = "Survival Probability",
col = 1,
lty = 1,
lwd = 2,
main = "Predicted Survival Curve"
)
# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score
# Plot the Brier score
plot(
bs.km,
type = "s",
col = 2,
ylab = "Brier Score",
xlab = "Time (Days)",
main = "Brier Score Over Time"
)
Initially, the Brier score is low, reflecting good prediction accuracy in the early stages.
The score increases over time, indicating reduced prediction accuracy as time progresses.
Beyond ~300 days, the Brier score stabilizes.
< 0.1 - excellent
<= 0.2 - superior
<=0.3 - adequate
# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance
# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
Variable = names(vimp),
VIMP = vimp
)
# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
filter(VIMP > 0) %>%
arrange(desc(VIMP)) %>%
head(30)
# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Variable Importance (VIMP) in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 8)
# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
filter(VIMP < 0) %>%
arrange(VIMP) # Sort in ascending order
# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "red") +
coord_flip() +
labs(
title = "Variables with Negative VIMP in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 8)
# Compare with VIMP
vimp_boruta_comparison <- vimp_df %>%
mutate(
Boruta_Status = case_when(
Variable %in% important_vars ~ "Confirmed Important",
Variable %in% tentative_vars ~ "Tentative",
TRUE ~ "Rejected"
)
)
# Filter for top N variables based on absolute VIMP
top_n <- 30 # Adjust this number as needed
vimp_boruta_comparison_filtered <- vimp_boruta_comparison %>%
arrange(desc(VIMP)) %>%
head(top_n)
# Plot Comparison for the filtered variables
ggplot(vimp_boruta_comparison_filtered, aes(x = reorder(Variable, VIMP), y = VIMP, fill = Boruta_Status)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(
title = paste("Top", top_n, "Variables: VIMP and Boruta Comparison"),
x = "Variables",
y = "VIMP",
fill = "Boruta Status"
) +
theme_minimal(base_size = 10) +
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.y = element_text(size = 8)
)
df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
mutate(
cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
cGVHD_flag = as.numeric(as.character(cGVHD_flag))
) %>%
select(-Observation_Days_Count) %>%
filter(Blood_test_day %in% c("90")) %>%
select(-Subject_ID, -Blood_test_day)
# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
filter(complete.cases(.)) # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 50) {
results <- data.frame(Trees = integer(), OOB_Error = numeric())
for (ntree in seq(50, max_trees, by = step)) {
set.seed(123) # Ensure reproducibility
rf_model <- rfsrc(
formula = formula,
data = data,
mtry = round(sqrt(ncol(data) - 2)),
nodesize = 15,
ntree = ntree, # Number of trees
importance = TRUE
)
# Extract the OOB Requested performance error
oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
results <- rbind(results, data.frame(
Trees = ntree,
OOB_Error = oob_error
))
}
return(results)
}
# Apply the function to your dataset
oob_results <- evaluate_trees(
data = dataSet, # Replace with your dataset
formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
max_trees = 1000,
step = 50
)
ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
geom_line() +
geom_point(size = 2) +
labs(
title = "OOB Requested Performance Error vs. Number of Trees",
x = "Number of Trees",
y = "OOB Requested Performance Error"
) +
theme_minimal()
# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]
print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 100"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
dataSet,
ntree = best_trees,
mtry = round(sqrt(ncol(dataSet) - 2)),
nodesize = 15,
membership = TRUE,
importance=TRUE)
# Print the Random Survival Forest object
print(RF_obj)
## Sample size: 67
## Number of deaths: 28
## Number of trees: 100
## Forest terminal node size: 15
## Average no. of terminal nodes: 4.22
## No. of variables tried at each split: 13
## Total no. of variables: 164
## Resampling used to grow trees: swor
## Resample size used to grow trees: 42
## Analysis: RSF
## Family: surv
## Splitting rule: logrank *random*
## Number of random split points: 10
## (OOB) CRPS: 81.34704027
## (OOB) stand. CRPS: 0.1620459
## (OOB) Requested performance error: 0.45494186
A lower CRPS indicates better performance.
# Create a hypothetical observation for prediction
# Creating an hypothetical observation
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names
# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])
# Plot predicted survival probability over time
par(cex.axis = 1.0, cex.lab = 1.0, cex.main = 1.0, mar = c(6.0, 6, 1, 1), mgp = c(4, 1, 0))
plot(
round(y.pred$time.interest, 2),
y.pred$survival[1, ],
type = "l",
xlab = "Time (Days)",
ylab = "Survival Probability",
col = 1,
lty = 1,
lwd = 2,
main = "Predicted Survival Curve"
)
# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score
# Plot the Brier score
plot(
bs.km,
type = "s",
col = 2,
ylab = "Brier Score",
xlab = "Time (Days)",
main = "Brier Score Over Time"
)
# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance
# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
Variable = names(vimp),
VIMP = vimp
)
# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
filter(VIMP > 0) %>%
arrange(desc(VIMP))
# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Variable Importance (VIMP) in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 5)
# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
filter(VIMP < 0) %>%
arrange(VIMP) # Sort in ascending order
# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "red") +
coord_flip() +
labs(
title = "Variables with Negative VIMP in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 6)
df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
mutate(
cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
cGVHD_flag = as.numeric(as.character(cGVHD_flag))
) %>%
select(-Observation_Days_Count) %>%
filter(Blood_test_day %in% c("180")) %>%
select(-Subject_ID, -Blood_test_day)
# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
filter(complete.cases(.)) # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 50) {
results <- data.frame(Trees = integer(), OOB_Error = numeric())
for (ntree in seq(50, max_trees, by = step)) {
set.seed(123) # Ensure reproducibility
rf_model <- rfsrc(
formula = formula,
data = data,
mtry = round(sqrt(ncol(data) - 2)),
nodesize = 15,
ntree = ntree, # Number of trees
importance = TRUE
)
# Extract the OOB Requested performance error
oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
results <- rbind(results, data.frame(
Trees = ntree,
OOB_Error = oob_error
))
}
return(results)
}
# Apply the function to your dataset
oob_results <- evaluate_trees(
data = dataSet, # Replace with your dataset
formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
max_trees = 1000,
step = 50
)
ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
geom_line() +
geom_point(size = 2) +
labs(
title = "OOB Requested Performance Error vs. Number of Trees",
x = "Number of Trees",
y = "OOB Requested Performance Error"
) +
theme_minimal()
# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]
print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 350"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
dataSet,
ntree = best_trees,
mtry = round(sqrt(ncol(dataSet) - 2)),
nodesize = 15,
membership = TRUE,
importance=TRUE)
# Print the Random Survival Forest object
print(RF_obj)
## Sample size: 54
## Number of deaths: 26
## Number of trees: 350
## Forest terminal node size: 15
## Average no. of terminal nodes: 2.9657
## No. of variables tried at each split: 13
## Total no. of variables: 164
## Resampling used to grow trees: swor
## Resample size used to grow trees: 34
## Analysis: RSF
## Family: surv
## Splitting rule: logrank *random*
## Number of random split points: 10
## (OOB) CRPS: 92.42719353
## (OOB) stand. CRPS: 0.18411792
## (OOB) Requested performance error: 0.621139
A lower CRPS indicates better performance.
# Create a hypothetical observation for prediction
# Creating an hypothetical observation
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names
# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])
# Plot predicted survival probability over time
plot(
round(y.pred$time.interest, 2),
y.pred$survival[1, ],
type = "l",
xlab = "Time (Days)",
ylab = "Survival Probability",
col = 1,
lty = 1,
lwd = 2,
main = "Predicted Survival Curve"
)
# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score
# Plot the Brier score
plot(
bs.km,
type = "s",
col = 2,
ylab = "Brier Score",
xlab = "Time (Days)",
main = "Brier Score Over Time"
)
# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance
# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
Variable = names(vimp),
VIMP = vimp
)
# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
filter(VIMP > 0) %>%
arrange(desc(VIMP))
# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Variable Importance (VIMP) in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 4)
# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
filter(VIMP < 0) %>%
arrange(VIMP) # Sort in ascending order
# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "red") +
coord_flip() +
labs(
title = "Variables with Negative VIMP in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 4)
df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
mutate(
cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
Blood_test_day = ifelse(Blood_test_day == "cGVHD", cGVHD_Diagnosis_Day, Blood_test_day),
Blood_test_day = as.numeric(as.character(Blood_test_day)),
cGVHD_flag = as.numeric(as.character(cGVHD_flag))
) %>%
select(-Observation_Days_Count) %>%
filter(Blood_test_day %in% c("365")) %>%
select(-Subject_ID, -Blood_test_day)
# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
filter(complete.cases(.)) # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 20) {
results <- data.frame(Trees = integer(), OOB_Error = numeric())
for (ntree in seq(20, max_trees, by = step)) {
set.seed(123) # Ensure reproducibility
rf_model <- rfsrc(
formula = formula,
data = data,
mtry = round(sqrt(ncol(data) - 2)),
nodesize = 15,
ntree = ntree, # Number of trees
importance = TRUE
)
# Extract the OOB Requested performance error
oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
results <- rbind(results, data.frame(
Trees = ntree,
OOB_Error = oob_error
))
}
return(results)
}
# Apply the function to your dataset
oob_results <- evaluate_trees(
data = dataSet, # Replace with your dataset
formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
max_trees = 1000,
step = 50
)
ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
geom_line() +
geom_point(size = 2) +
labs(
title = "OOB Requested Performance Error vs. Number of Trees",
x = "Number of Trees",
y = "OOB Requested Performance Error"
) +
theme_minimal()
# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]
print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 20"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
dataSet,
ntree = best_trees,
mtry = round(sqrt(ncol(dataSet) - 2)),
nodesize = 15,
membership = TRUE,
importance=TRUE)
# Print the Random Survival Forest object
print(RF_obj)
## Sample size: 26
## Number of deaths: 13
## Number of trees: 20
## Forest terminal node size: 15
## Average no. of terminal nodes: 1
## No. of variables tried at each split: 13
## Total no. of variables: 164
## Resampling used to grow trees: swor
## Resample size used to grow trees: 16
## Analysis: RSF
## Family: surv
## Splitting rule: logrank *random*
## Number of random split points: 10
## (OOB) CRPS: 89.22882723
## (OOB) stand. CRPS: 0.17774667
## (OOB) Requested performance error: 0.84552846
A lower CRPS indicates better performance.
# Create a hypothetical observation for prediction
# Creating an hypothetical observation
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names
# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])
# Plot predicted survival probability over time
par(cex.axis = 1.0, cex.lab = 1.0, cex.main = 1.0, mar = c(6.0, 6, 1, 1), mgp = c(4, 1, 0))
plot(
round(y.pred$time.interest, 2),
y.pred$survival[1, ],
type = "l",
xlab = "Time (Days)",
ylab = "Survival Probability",
col = 1,
lty = 1,
lwd = 2,
main = "Predicted Survival Curve"
)
# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score
# Plot the Brier score
plot(
bs.km,
type = "s",
col = 2,
ylab = "Brier Score",
xlab = "Time (Days)",
main = "Brier Score Over Time"
)
# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance
# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
Variable = names(vimp),
VIMP = vimp
)
# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
filter(VIMP > 0) %>%
arrange(desc(VIMP))
# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Variable Importance (VIMP) in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 5)
# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
filter(VIMP < 0) %>%
arrange(VIMP) # Sort in ascending order
# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "red") +
coord_flip() +
labs(
title = "Variables with Negative VIMP in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 6)
df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
mutate(
cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
cGVHD_flag = as.numeric(as.character(cGVHD_flag))
) %>%
select(-Observation_Days_Count) %>%
filter(Blood_test_day %in% c("cGVHD")) %>%
select(-Subject_ID, -Blood_test_day)
# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
filter(complete.cases(.)) # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 20) {
results <- data.frame(Trees = integer(), OOB_Error = numeric())
for (ntree in seq(10, max_trees, by = step)) {
set.seed(123) # Ensure reproducibility
rf_model <- rfsrc(
formula = formula,
data = data,
mtry = round(sqrt(ncol(data) - 2)),
nodesize = 15,
ntree = ntree, # Number of trees
importance = TRUE
)
# Extract the OOB Requested performance error
oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
results <- rbind(results, data.frame(
Trees = ntree,
OOB_Error = oob_error
))
}
return(results)
}
# Apply the function to your dataset
oob_results <- evaluate_trees(
data = dataSet, # Replace with your dataset
formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
max_trees = 1000,
step = 50
)
ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
geom_line() +
geom_point(size = 2) +
labs(
title = "OOB Requested Performance Error vs. Number of Trees",
x = "Number of Trees",
y = "OOB Requested Performance Error"
) +
theme_minimal()
# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]
print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 10"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
dataSet,
ntree = best_trees,
mtry = round(sqrt(ncol(dataSet) - 2)),
nodesize = 15,
membership = TRUE,
importance=TRUE)
# Print the Random Survival Forest object
print(RF_obj)
## Sample size: 25
## Number of deaths: 25
## Number of trees: 10
## Forest terminal node size: 15
## Average no. of terminal nodes: 1
## No. of variables tried at each split: 13
## Total no. of variables: 164
## Resampling used to grow trees: swor
## Resample size used to grow trees: 16
## Analysis: RSF
## Family: surv
## Splitting rule: logrank *random*
## Number of random split points: 10
## (OOB) CRPS: 39.19364759
## (OOB) stand. CRPS: 0.078075
## (OOB) Requested performance error: 0.64166667
A lower CRPS indicates better performance.
# Create a hypothetical observation for prediction
# Creating an hypothetical observation
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names
# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])
# Plot predicted survival probability over time
par(cex.axis = 1.0, cex.lab = 1.0, cex.main = 1.0, mar = c(6.0, 6, 1, 1), mgp = c(4, 1, 0))
plot(
round(y.pred$time.interest, 2),
y.pred$survival[1, ],
type = "l",
xlab = "Time (Days)",
ylab = "Survival Probability",
col = 1,
lty = 1,
lwd = 2,
main = "Predicted Survival Curve"
)
# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score
# Plot the Brier score
plot(
bs.km,
type = "s",
col = 2,
ylab = "Brier Score",
xlab = "Time (Days)",
main = "Brier Score Over Time"
)
# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance
# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
Variable = names(vimp),
VIMP = vimp
)
# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
filter(VIMP > 0) %>%
arrange(desc(VIMP))
# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Variable Importance (VIMP) in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 5)
# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
filter(VIMP < 0) %>%
arrange(VIMP) # Sort in ascending order
# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
geom_bar(stat = "identity", fill = "red") +
coord_flip() +
labs(
title = "Variables with Negative VIMP in RSF",
x = "Variables",
y = "VIMP"
) +
theme_minimal(base_size = 6)
Comparison on the day of the onset of chronic GVHD with the closest points in terms of timing in patients without chronic GVHD.
Group 1: Patients diagnosed with cGVHD (cGVHD_flag = 1). Group 2: Patients without cGVHD (cGVHD_flag = 0).
Due to large ammount of immune cell markers (332):
# Plot the density plot for cGVHD Diagnosis Day
df_transformed %>%
filter(cGVHD_flag == 1) %>%
ggplot(aes(x = cGVHD_Diagnosis_Day)) +
geom_density(fill = "blue", alpha = 0.7) +
theme_minimal() +
labs(
title = "Distribution of cGVHD Diagnosis Day",
x = "cGVHD Diagnosis Day",
y = "Density"
)